Variables:
Risk
Money
Security
Good time Help Success Proper Environment Tradition Creativity
Friends important Family important Leisure time Happiness Health (subjective) Satisfaction Freedom
Sex Age Country Wave Marital status Children Employment Education
Do we want to combine the data? Find out about the overlap: which countries appear in both cases? Risk item –> revert. Facet plot: plot by country, 20 countries in a plot. Order countries by continent? Scatterplot: (average age, average risk). Does an on-average older or younger population in a country –> more risk? Mixed effects model with hierarchy (people clustered into countries). What are we trying to find out?
Regression that considers countries. Average / country effect. x: risk taking; y: age (plot by how steep the slope is –> regression for each country). Slope/intercept –> plot. Mixed effect model –> extract mixed and random effect model (save intercept and slope). Able to visualize. Independent regression for each country for each data set. Table with countries, N, intercept and slope (risk and age), effect for age, effect for gender. Hardship Index? Do we see the same effect in the countries? What visualization? –> Goal? Rationale? Do a sketch of what we want to do: 1.) plot age; 2.) How much data per country? Age distribution per country? (include age and gender in plots). Work on the list!!
library(data.table)
library(tidyr)
# ---- Wave 5: load the raw file ----
WV5_data <- readRDS("/Users/cristinacandido/Documents/Github/risk_wvs/data/WVS/F00007944-WV5_Data_R_v20180912.rds")
# Work with a plain data.frame copy of the loaded object.
WV5_data_df <- as.data.frame(WV5_data)
# Peek at the first rows of the first five columns.
head(WV5_data_df[, seq_len(5)])
library(dplyr)
# Give the WVS item codes readable names.
WV5_data <- rename(
  WV5_data_df,
  sex = V235, age = V237, country = V2, wave = V1, risk = V86
)
WV5_data
# Keep only the variables of interest.
WV5_data <- select(WV5_data, sex, age, country, wave, risk)
WV5_data
# Decode the country codes: look each code up in the code/name table and
# store the matching name in `country_lab`.
countrynames <- read.csv(
  "/Users/cristinacandido/Documents/Github/risk_wvs/data/WVS/countrynames.txt",
  header = FALSE,
  as.is = TRUE
)
names(countrynames) <- c("code", "name")
WV5_data[["country_lab"]] <-
  countrynames[["name"]][match(WV5_data[["country"]], countrynames[["code"]])]
# Respondents per country.
table(WV5_data$country_lab)
Andorra Argentina Australia Brazil Bulgaria Burkina Faso Canada
1003 1002 1421 1500 1001 1534 2164
Chile China Colombia Cyprus (G) Egypt Ethiopia Finland
1000 1991 3025 1050 3051 1500 1014
France Georgia Germany Ghana Great Britain Guatemala Hong Kong
1001 1500 2064 1534 1041 1000 1252
Hungary India Indonesia Iran Iraq Italy Japan
1007 2001 2015 2667 2701 1012 1096
Jordan Malaysia Mali Mexico Moldova Morocco Netherlands
1200 1201 1534 1560 1046 1200 1050
New Zealand Norway Peru Poland Romania Russia Rwanda
954 1025 1500 1000 1776 2033 1507
Slovenia South Africa South Korea Spain Sweden Switzerland Taiwan
1037 2988 1200 1200 1003 1241 1227
Thailand Trinidad and Tobago Turkey Ukraine United States Uruguay Viet Nam
1534 1002 1346 1000 1249 1000 1495
Zambia
1500
WV5_data
NA
NA
# Read dataset (Wave 6).
# load() restores the saved objects into the workspace and returns only their
# NAMES as a character vector, so its return value is not the data itself.
# Keep that vector separately and verify that the expected object was restored
# before using it.
wv6_objects <- load("/Users/cristinacandido/Documents/Github/risk_wvs/data/WVS/WV6_Data_R_v20201117.rdata")
stopifnot("WV6_Data_R_v20201117" %in% wv6_objects)
WV6_data <- WV6_Data_R_v20201117
print(WV6_data)
```{r}
#rename variables
# Rename the Wave 6 item codes and keep only the variables of interest.
# select() renames while it selects, so one call replaces the separate
# rename()/select() pair; each column is listed exactly once (the earlier
# version named `sex` twice). Resulting column order: wave, sex, age,
# country, risk.
WV6_data <- WV6_data %>%
  select(wave = V1, sex = V240, age = V242, country = V2, risk = V76)
WV6_data
NA
# Decode dataset (Wave 6): map each country code to its name using the
# code/name lookup file and store the result in `country_lab`.
countrynames <- read.csv(
  "/Users/cristinacandido/Documents/Github/risk_wvs/data/WVS/countrynames.txt",
  header = FALSE,
  as.is = TRUE
)
names(countrynames) <- c("code", "name")
WV6_data[["country_lab"]] <-
  countrynames[["name"]][match(WV6_data[["country"]], countrynames[["code"]])]
# Respondents per country.
table(WV6_data$country_lab)
Algeria Argentina Armenia Australia Azerbaijan Belarus Brazil Chile
1200 1030 1100 1477 1002 1535 1486 1000
China Colombia Cyprus (G) Ecuador Egypt Estonia Georgia Germany
2300 1512 1000 1202 1523 1533 1202 2046
Ghana Haiti Hong Kong India Iraq Japan Jordan Kazakhstan
1552 1996 1000 4078 1200 2443 1200 1500
Kuwait Kyrgyzstan Lebanon Libya Malaysia Mexico Morocco Netherlands
1303 1500 1200 2131 1300 2000 1200 1902
New Zealand Nigeria Pakistan Palestine Peru Philippines Poland Qatar
841 1759 1200 1000 1210 1200 966 1060
Romania Russia Rwanda Singapore Slovenia South Africa South Korea Spain
1503 2500 1527 1972 1069 3531 1200 1189
Sweden Taiwan Thailand Trinidad and Tobago Tunisia Turkey Ukraine United States
1206 1238 1200 999 1205 1605 1500 2232
Uruguay Uzbekistan Yemen Zimbabwe
1000 1500 1000 1500
WV6_data
# Combine the two data sets: Wave 5 rows first, Wave 6 rows appended.
WV5_data
WV6_data
data <- rbind(WV5_data, WV6_data)
data
# Number of distinct country labels in the combined data.
length(unique(data[["country_lab"]]))
[1] 78
# Number of participants: counts the rows of the combined Wave 5 + Wave 6
# data frame, before any exclusion is applied.
nrow(data)
[1] 173540
# Exclusion of participants: keep only rows where risk, sex and age are all
# strictly positive. which() drops rows where the condition is NA, exactly as
# subset() does.
data <- data[with(data, which(risk > 0 & sex > 0 & age > 0)), , drop = FALSE]
# Number of countries still represented after the exclusion.
length(unique(data$country_lab))
[1] 78
data
# Reverse the risk item so that larger values mean more risk taking.
# 6 - x + 1 is simply 7 - x.
# NOTE(review): this presumes the item is coded 1..6 -- confirm against the
# WVS codebook.
data$risk <- 7 - data$risk
# Keep the reversed, unstandardised score as the ordinal version.
data$risk_ord <- data$risk
# Standardise to a T-score (mean 50, SD 10). scale() returns a one-column
# matrix; storing that in a data frame yields a matrix column (it shows up as
# `risk.V1` in summary()), so flatten it to a plain numeric vector first.
data$risk <- 10 * as.numeric(scale(data$risk, center = TRUE, scale = TRUE)) + 50
data
NA
# Number of males vs females (1 = males; 2 = females).
# Tabulates how often each sex code occurs in `data` at this point.
table(data$sex)
1 2
47262 50079
# Categorical age variable. findInterval() counts how many of the cut points
# 20, 30, ..., 80 are <= age (0 for ages below 20, 7 for ages of 80 or more);
# adding 1 turns that count into an index into the label vector. Missing ages
# stay NA.
data$agecat <- c(
  "15-19", "20-29", "30-39", "40-49", "50-59", "60-69", "70-79", "80+"
)[findInterval(data$age, seq(20, 80, by = 10)) + 1L]
# Gender variable: replace code 1 with "male" and code 2 with "female".
data$sex <- replace(data$sex, data$sex == 1, "male")
data$sex <- replace(data$sex, data$sex == 2, "female")
# Average age of participants.
mean(data$age)
[1] 41.62714
# Education variable: collapse the education codes into three categories
#   < 3   -> "incomplete or no primary education"
#   3..6  -> "no uni"
#   >= 7  -> "uni"
# Columns are referenced explicitly through `data$...` instead of
# attach()/detach(): attach() puts a copy of the data frame on the search
# path, so a same-named global variable silently takes precedence and a
# failure between the two calls leaves the copy attached.
# NOTE(review): none of the select() calls visible in this file keeps an
# `education` column, so the recode is guarded; add the WVS education item to
# the rename()/select() steps of both waves to enable it.
if ("education" %in% names(data)) {
  data$education_cat <- NA_character_
  data$education_cat[data$education < 3] <- "incomplete or no primary education"
  data$education_cat[data$education > 2 & data$education <= 6] <- "no uni"
  data$education_cat[data$education >= 7] <- "uni"
  print(table(data$education))
} else {
  warning(
    "`data` has no `education` column; skipping the education recode.",
    call. = FALSE
  )
}
data
# Wave variable: replace the wave codes 5 and 6 with readable labels.
data$wave <- replace(data$wave, data$wave == 5, "Wave 5")
data$wave <- replace(data$wave, data$wave == 6, "Wave 6")
data
# Age range: smallest and largest age present in `data`.
range(data$age)
[1] 15 99
# Risk-taking frequency: histogram of the `risk` score.
library(ggplot2)
ggplot(data, aes(x = risk)) +
  geom_histogram(binwidth = 0.5, fill = "lightblue", color = "black") +
  labs(x = "Risk Taking", y = "Frequency", title = "Histogram of Risk Taking") +
  theme_minimal()
# Age frequency: histogram of age.
# NOTE(review): age looks like whole years (see range(data$age)); a bin width
# of 0.5 then leaves every other bin empty, so one bin per year is used.
# Confirm that age has no fractional values.
ggplot(data, aes(x = age)) +
  geom_histogram(binwidth = 1, fill = "lightblue", color = "black") +
  labs(x = "Age", y = "Frequency", title = "Histogram of Age Distribution") +
  theme_minimal()
# Age vs risk taking: one box of risk scores per age band (`agecat`).
ggplot(data, aes(x = agecat, y = risk)) +
  geom_boxplot() +
  labs(
    title = "Boxplot of Risk and Adventure by Age",
    x = "Age",
    y = "Risk and Adventure"
  ) +
  theme_minimal()
NA
NA
# Sex vs risk taking: one box of risk scores per sex category.
ggplot(data = data, mapping = aes(x = as.factor(sex), y = risk)) +
  geom_boxplot()
# Descriptive statistics for every column of the data set.
summary(data)
sex age country wave risk.V1 country_lab agecat risk_ord.V1
Length:156528 Min. : 15.00 Min. : 12.0 Min. :5.000 Min. :36.15444 Length:156528 Length:156528 Min. :-56.84556
Class :character 1st Qu.: 28.00 1st Qu.:276.0 1st Qu.:5.000 1st Qu.:42.42999 Class :character Class :character 1st Qu.:-50.57001
Mode :character Median : 39.00 Median :466.0 Median :6.000 Median :48.70553 Mode :character Mode :character Median :-44.29447
Mean : 41.62 Mean :477.4 Mean :5.551 Mean :50.00677 Mean :-42.99323
3rd Qu.: 54.00 3rd Qu.:710.0 3rd Qu.:6.000 3rd Qu.:54.98108 3rd Qu.:-38.01892
Max. :102.00 Max. :894.0 Max. :6.000 Max. :67.53218 Max. :-25.46782
# Data cleaning: drop every row that contains at least one NA, then
# re-inspect the descriptives.
data <- na.omit(object = data)
summary(data)
sex age country wave risk.V1 country_lab agecat risk_ord.V1
Length:156528 Min. : 15.00 Min. : 12.0 Min. :5.000 Min. :36.15444 Length:156528 Length:156528 Min. :-56.84556
Class :character 1st Qu.: 28.00 1st Qu.:276.0 1st Qu.:5.000 1st Qu.:42.42999 Class :character Class :character 1st Qu.:-50.57001
Mode :character Median : 39.00 Median :466.0 Median :6.000 Median :48.70553 Mode :character Mode :character Median :-44.29447
Mean : 41.62 Mean :477.4 Mean :5.551 Mean :50.00677 Mean :-42.99323
3rd Qu.: 54.00 3rd Qu.:710.0 3rd Qu.:6.000 3rd Qu.:54.98108 3rd Qu.:-38.01892
Max. :102.00 Max. :894.0 Max. :6.000 Max. :67.53218 Max. :-25.46782
# Risk distribution by survey wave (one box per wave).
ggplot(data = data, mapping = aes(x = as.factor(wave), y = risk)) +
  geom_boxplot()
# Risk (x axis) against age (y axis) with a fitted linear trend.
library(ggplot2)
ggplot(data = data, mapping = aes(x = risk, y = age)) +
  geom_point() +
  geom_smooth(method = "lm")
# Risk by country for a three-country subset (Andorra, Romania, Spain).
data1 <- data[
  data$country_lab %in% c("Andorra", "Romania", "Spain"), ,
  drop = FALSE
]
ggplot(data = data1, mapping = aes(x = as.factor(country_lab), y = risk)) +
  geom_boxplot()
# Age (x axis) against risk (y axis), coloured by country, with a linear fit
# and standard-error band per colour group.
ggplot(
  data = data,
  mapping = aes(x = age, y = risk, color = as.factor(country_lab))
) +
  geom_point() +
  geom_smooth(method = "lm", se = TRUE)
ncol(countryfacts) # this will print the number of columns in countryfacts
[1] 9
length(labels) # this will print the length of the labels vector
[1] 10
```